library(tidyverse)
## ── Attaching packages ────────
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.1 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ─────────────────
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(forcats)
#install.packages("plotly")
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Restore the objects stored in infant.RData into the workspace. The rest of
# the script works on an object called `infant`, presumably provided by this
# file (verify against the data source).
load(file = "infant.RData")
# infant %>% View()
# NA ----------------------------------------------------------------------
# Toy example: the share of missing entries is the mean of the NA indicator.
a <- c(NA, 1, 4, NA)
mean(is.na(a))
## [1] 0.5
# Share of missing values per variable, pooled over all records.
# The per-column summary is passed as a plain function instead of funs():
# funs() is deprecated since dplyr 0.8.0, while a bare function is accepted
# by older and newer dplyr versions alike.
infant %>%
  summarise_all(function(x) sum(is.na(x)) / length(x)) %>%
  # Long format: one row per variable (`key`) with its NA share (`value`).
  gather() %>%
  ggplot(aes(x = key, y = value)) +
  geom_col() +
  # Flip the axes so the variable names run along the vertical axis.
  coord_flip()

# Share of missing values per variable, one panel per delivery year.
# The per-column summary is a plain function because funs() is deprecated
# since dplyr 0.8.0; a bare function works with old and new dplyr versions.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise_all(function(x) sum(is.na(x)) / length(x)) %>%
  # Gather every column except the year into key/value pairs. Key and value
  # are spelled out so the exclusion can be passed as a regular selection
  # instead of through an argument literally named `...`.
  gather(key = "key", value = "value", -date_of_delivery_y) %>%
  ggplot(aes(x = key, y = value)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~date_of_delivery_y)

# Share of missing values over delivery years, one line per variable.
# The per-column summary is a plain function because funs() is deprecated
# since dplyr 0.8.0; a bare function works with old and new dplyr versions.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise_all(function(x) sum(is.na(x)) / length(x)) %>%
  # Gather every column except the year into key/value pairs. Key and value
  # are spelled out so the exclusion can be passed as a regular selection
  # instead of through an argument literally named `...`.
  gather(key = "key", value = "value", -date_of_delivery_y) %>%
  ggplot(aes(x = date_of_delivery_y, y = value)) +
  geom_line(aes(color = key))

#ggplotly()
# - Missing values increase after 2005.
# - Missing values occur in the mother's education.
# Share of missing values per variable, coloured by the mother's race /
# Hispanic origin category (race_and_hispanic_orig_of_mother_c4).
# The per-column summary is a plain function because funs() is deprecated
# since dplyr 0.8.0; a bare function works with old and new dplyr versions.
infant %>%
  group_by(race_and_hispanic_orig_of_mother_c4) %>%
  summarise_all(function(x) sum(is.na(x)) / length(x)) %>%
  # Gather every column except the grouping variable into key/value pairs.
  # Key and value are spelled out so the exclusion can be passed as a regular
  # selection instead of through an argument literally named `...`.
  gather(
    key = "key", value = "value",
    -race_and_hispanic_orig_of_mother_c4
  ) %>%
  ggplot(aes(
    x = value, y = key,
    color = race_and_hispanic_orig_of_mother_c4
  )) +
  geom_point()

# - No sex-specific missing values.
# - Missing values depend on race and origin.
# Infant mortality --------------------------------------------------------
# Add a 0/1 death indicator: 1 when age_at_death_d is recorded, 0 when it is
# missing.
infant <- mutate(
  infant,
  death = as.numeric(!is.na(age_at_death_d))
)
# Share of records flagged as deaths (`imr`) for each delivery year.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(imr = sum(death) / length(death))
## # A tibble: 16 x 2
## date_of_delivery_y imr
## <int> <dbl>
## 1 1995 0.00715
## 2 1996 0.00707
## 3 1997 0.00709
## 4 1998 0.00680
## 5 1999 0.00697
## 6 2000 0.00679
## 7 2001 0.00677
## 8 2002 0.00709
## 9 2003 0.00687
## 10 2004 0.00673
## 11 2005 0.00663
## 12 2006 0.00672
## 13 2007 0.00693
## 14 2008 0.00631
## 15 2009 0.00622
## 16 2010 0.00623
# aggregate(death ~ date_of_delivery_y,
# FUN = function (x) {sum(x)/length(x)},
# data = infant)
# Death share per value of date_of_delivery_ym (presumably year-month),
# drawn as a line with a smoothed trend on top.
infant %>%
  group_by(date_of_delivery_ym) %>%
  summarise(imr = sum(death) / length(death)) %>%
  ggplot(mapping = aes(y = imr, x = date_of_delivery_ym)) +
  geom_line() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# Death share per value of date_of_delivery_ym, split by sex: one coloured
# line and one smoothed trend per sex.
infant %>%
  group_by(date_of_delivery_ym, sex) %>%
  summarise(imr = sum(death) / length(death)) %>%
  ggplot(mapping = aes(
    y = imr,
    x = date_of_delivery_ym,
    color = sex
  )) +
  geom_line() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# Sex-specific death share per value of date_of_delivery_ym, drawn as
# coloured lines with a smoothed trend for each sex.
infant %>%
  group_by(date_of_delivery_ym, sex) %>%
  summarise(imr = sum(death) / length(death)) %>%
  ggplot(mapping = aes(
    color = sex,
    x = date_of_delivery_ym,
    y = imr
  )) +
  geom_line() +
  geom_smooth()
## `geom_smooth()` using method = 'loess'

# Yearly death share by sex, one panel per combination of
# education_of_mother_c2 (rows) and race_and_hispanic_orig_of_mother_c2
# (columns). Records with either stratifier missing are dropped first.
infant %>%
  filter(
    !(is.na(education_of_mother_c2) |
      is.na(race_and_hispanic_orig_of_mother_c2))
  ) %>%
  group_by(
    date_of_delivery_y,
    sex,
    education_of_mother_c2,
    race_and_hispanic_orig_of_mother_c2
  ) %>%
  summarise(imr = sum(death) / length(death)) %>%
  ggplot(mapping = aes(y = imr, x = date_of_delivery_y, color = sex)) +
  geom_line() +
  # Straight-line trend per sex, without a confidence band.
  geom_smooth(se = FALSE, method = "lm") +
  facet_grid(education_of_mother_c2 ~ race_and_hispanic_orig_of_mother_c2)

# Gestation at birth ------------------------------------------------------
# Density histogram of gestation_at_delivery_w, one panel per delivery year.
# binwidth = 1 gives one bar per unit of gestation; the default of 30 bins
# does not line up with whole-number values (assumed here -- the later
# `== 39` / `== 40` comparisons suggest it) and can create artificial spikes
# and gaps.
infant %>%
  ggplot() +
  geom_histogram(
    aes(x = gestation_at_delivery_w, y = ..density..),
    binwidth = 1
  ) +
  facet_wrap(~date_of_delivery_y)
## Warning: Removed 463049 rows containing non-finite values (stat_bin).

# Toy example: the share of elements meeting a condition is the mean of the
# logical vector.
a <- 1:6
mean(a > 2)
## [1] 0.6666667
# Share of births per delivery year with gestation_at_delivery_w above 42.
# mean(..., na.rm = TRUE) divides by the number of births with a recorded
# gestation. Dividing by n() also counted births whose gestation is missing,
# so year-to-year changes in missingness leaked into the proportion.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(p = mean(gestation_at_delivery_w > 42, na.rm = TRUE)) %>%
  ggplot(aes(x = date_of_delivery_y, y = p)) +
  geom_line()

# Interactive version of the plot drawn above.
ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Share of births per delivery year with gestation_at_delivery_w below 38.
# mean(..., na.rm = TRUE) divides by the number of births with a recorded
# gestation. Dividing by n() also counted births whose gestation is missing,
# so year-to-year changes in missingness leaked into the proportion.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(p = mean(gestation_at_delivery_w < 38, na.rm = TRUE)) %>%
  ggplot(aes(x = date_of_delivery_y, y = p)) +
  geom_line()

# Interactive version of the plot drawn above.
ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Share of births per delivery year with gestation_at_delivery_w equal to 39.
# mean(..., na.rm = TRUE) divides by the number of births with a recorded
# gestation. Dividing by n() also counted births whose gestation is missing,
# so year-to-year changes in missingness leaked into the proportion.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(p = mean(gestation_at_delivery_w == 39, na.rm = TRUE)) %>%
  ggplot(aes(x = date_of_delivery_y, y = p)) +
  geom_line()

# Interactive version of the plot drawn above.
ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Share of births per delivery year with gestation_at_delivery_w equal to 40.
# mean(..., na.rm = TRUE) divides by the number of births with a recorded
# gestation. Dividing by n() also counted births whose gestation is missing,
# so year-to-year changes in missingness leaked into the proportion.
infant %>%
  group_by(date_of_delivery_y) %>%
  summarise(p = mean(gestation_at_delivery_w == 40, na.rm = TRUE)) %>%
  ggplot(aes(x = date_of_delivery_y, y = p)) +
  geom_line()

# Interactive version of the plot drawn above.
ggplotly()
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# - Possible coding issue in gestation at birth in 2003?
# Level and shape of infant mortality -------------------------------------
# Toy example: successive differences padded with a trailing NA, so the
# result is as long as the input.
append(diff(c(1, 5, 6, 7, 9)), NA)
## [1] 4 1 1 2 NA
# Scatter of period-to-period changes in the death share (`imr`) against
# changes in the share of deaths with age_at_death_d below 7 (`p`), computed
# per value of date_of_delivery_ym.
infant %>%
  group_by(date_of_delivery_ym) %>%
  summarise(
    imr = sum(death) / length(death),
    p = sum(age_at_death_d < 7, na.rm = TRUE) / sum(death)
  ) %>%
  mutate(
    # Forward differences: the last row has no successor and gets NA.
    diff_imr = c(diff(imr), NA),
    diff_p = c(diff(p), NA)
  ) %>%
  ggplot(mapping = aes(x = diff_imr, y = diff_p)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
